###################################################
### Project: Issue Competition in Parliamentary ###
###          Speeches?                          ###
### Title:   Classification                     ###
### Author:  Christoph Ivanusch                 ###
###################################################

# preparation

## clear the workspace: remove every object that ls() lists in the current
## environment (by default ls() does not list names starting with a dot);
## attached packages and options() are not touched by this call
## NOTE(review): when this file is source()d from an interactive session this
## also deletes the caller's objects -- running the script in a fresh R session
## would make this line unnecessary
rm(list = ls()) 

## load packages
library(newsmap)
library(quanteda)
library(stringi)
library(dplyr)
library(tidyr)

## load preprocessed data
## The script uses `toks` (for the document-feature matrices) and `corp_sent`
## (for the final data frame) without creating them, so they have to be
## provided by this file. load() returns the names of the restored objects,
## which lets us stop right away with a clear message instead of failing later
## with "object not found".
loaded_objects <- load("Preprocession_Data.RDA")
missing_objects <- setdiff(c("toks", "corp_sent"), loaded_objects)
if (length(missing_objects) > 0) {
  stop(
    "Preprocession_Data.RDA does not contain the required object(s): ",
    paste(missing_objects, collapse = ", "),
    call. = FALSE
  )
}

## Seed-word dictionary: one entry per topic, each holding the seed patterns
## for that topic (a trailing "*" acts as a wildcard under the default glob
## matching of tokens_lookup(); entries containing a space are multi-word
## expressions).
## "Greeting" is deliberately listed first: the smoothing step further down
## treats sentences classified as greeting specially.
seedword_dict <- dictionary(list(
  Greeting = c(
    "geehrt*", "Hohes Haus", "Kolleg*", "Herren", "Damen", "Geschätzter"
  ),
  Labor_Welfare = c(
    "Beschäftigung*", "Arbeit*", "Beruf*", "Pension*", "Gehalt*", "Lohn*",
    "Gewerkschaft*", "Konsumentensch*", "Produktsicherheit*", "Armut*",
    "Wohlfahrt*", "sozial*", "Einkommen", "Sozial*", "Mindestsicherung*"
  ),
  EU_ForeignAffairs = c(
    "EU*", "Europäische* Union", "Europapolitik", "Mitgliedsstaat*", "EZB*",
    "Brüssel*", "Kommission*", "Außenpolit*", "außenpolit*", "Diplomat*",
    "UN", "Entwicklungs*"
  ),
  Housing = c(
    "Bau", "Wohnbau*", "Miete*", "Wohnung*", "Sanierung*", "Hausbau*",
    "Häuser*", "Immobilie*"
  ),
  Finances = c(
    "Finanz*", "Budget*", "Kredit*", "Schulden*", "Bank*", "Ausgabe*",
    "Einsparung*", "Steuer*", "Rechnungshof*"
  ),
  Family_Youth = c(
    "Famili*", "Kind*", "Eltern*", "Geschwist*", "Jugend*"
  ),
  Science = c(
    "Wissenschaft*", "Forsch*", "Universitä*", "Labor*"
  ),
  Education = c(
    "Bildung*", "Unterricht*", "Schul*", "Schüler*", "Studen*",
    "Universitä*", "Kindergart*", "Lehr*", "Ausbildung*", "Studierend*",
    "Studiengebühr*"
  ),
  Health = c(
    "Gesund*", "Krank*", "Spital*", "Medizin*", "medizin*", "Patient*",
    "Rett*", "Arznei*", "Apothek*", "Arzt", "Ärzt*", "Pflege*"
  ),
  Women = c(
    "Frauen*", "Gleichbehandlung*", "Gleichstell*", "Gender*",
    "Ungleichbehandlung*", "Diskrimin*"
  ),
  Security_Crime = c(
    "Krimin*", "Sicherheit*", "Straf*", "Polizei*", "Mord*", "Gewalt*",
    "Waffe*", "Dieb*", "Exekutive", "Polizist*"
  ),
  JudicialSystem = c(
    "Justiz*", "Gericht*", "Richter*", "Staatsanwalt*", "Strafprozess*",
    "Gefängnis*", "Haft*", "Verfassung*", "Volksanwalt*"
  ),
  Parliament = c(
    "Ausschuss*", "U-Ausschuss*", "Geschäftsord*", "Ordnungsruf*",
    "Tagesordnung*", "parlament*", "Parlament*", "Untersuchungsausschuss"
  ),
  Culture = c(
    "Kunst*", "Künst*", "Kultur*", "künstler*", "ORF", "Film*", "Theat*",
    "Schauspiele*", "aufführ*", "Musik*", "Galerie*"
  ),
  Defense = c(
    "Verteidig*", "Heer*", "Bundesheer*", "Mili*", "Soldat*", "Waffe*",
    "Luftwaff*", "Trupp*", "Grundwehr*", "Präsenzdien*"
  ),
  Agriculture = c(
    "Landwirt*", "Bauer*", "Forst*", "Wald*", "Agrar*", "Lebensmittel*",
    "Pflanzen*", "Bäuerin*"
  ),
  CivilRights = c(
    "Menschenrecht*", "Bürgerrecht*", "Meinungsfreiheit*", "Pressefreiheit*",
    "Versammlungsfreiheit*", "Informationsfreiheit*", "Wahlrecht*",
    "Grundfreiheit*", "Grundrecht*"
  ),
  Sports = c(
    "Sport*", "Olympia*", "Training*", "trainier*"
  ),
  Economy_Energy = c(
    "Wirtschaft*", "wirtschaftl*", "Industrie", "Tourismus*", "touristisch*",
    "BIP*", "Inflation*", "Energie*", "Gas*", "Unternehme*", "Betrieb*",
    "wirtschaftlich*", "Firma", "Firmen*"
  ),
  Environment = c(
    "Umwelt*", "Natur*", "Klima*", "Luftver*", "Feinstaub*", "Emission*",
    "Treibhaus*", "Nachhaltig*", "ökolog*"
  ),
  Immigration = c(
    "Migra*", "Integration*", "integrier*", "Flüchtling*", "Ausländ*",
    "Asyl*", "multikult*"
  ),
  Transportation = c(
    "Infrastruktur*", "Verkehr*", "Transport*", "Transit*", "ÖBB", "ASFINAG",
    "Straße*", "Fahrzeug*", "PKW*", "LKW*", "Züge*", "Bahn*", "Auto*", "Flug*"
  )
))


# perform newsmap-analysis

## label matrix: replace the tokens by the seed-dictionary keys they match and
## count the keys per document (case is kept)
toks_label <- tokens_lookup(toks, dictionary = seedword_dict)
dfmat_label <- dfm(toks_label, tolower = FALSE)

## feature matrix: all tokens with their original case ...
dfmat_feat <- dfm(toks, tolower = FALSE)

## ... reduced to features that start with an ASCII capital letter followed by
## at least one character from [A-Za-z1-2] (case-sensitive regex) ...
## NOTE(review): the class "1-2" looks like a typo for "0-9", and the ASCII-only
## classes skip words with an umlaut in the first two characters (e.g.
## "Österreich", "Bürger") -- confirm whether this is intended before changing
## it, since it alters the model's feature set.
dfmat_feat_select <- dfm_select(
  dfmat_feat,
  pattern = "^[A-Z][A-Za-z1-2]+",
  valuetype = "regex",
  case_insensitive = FALSE
)

## ... and to features that occur at least 10 times overall
dfmat_feat_select <- dfm_trim(dfmat_feat_select, min_termfreq = 10)

## fit the newsmap model: selected features as input, seed-word labels as target
tmod_nm <- textmodel_newsmap(x = dfmat_feat_select, y = dfmat_label)

## predict (without contextual smoothing) on the data the model was fitted on
pred_nm <- predict(object = tmod_nm)

## quick look at the first 20 predictions
head(x = pred_nm, n = 20)

## predict (with contextual smoothing)
## For every text with enough sentences, the per-sentence class scores are
## averaged over a moving window of neighbouring sentences and each sentence
## gets the class with the highest averaged score. Result: `pred_nm_smoothed`,
## a character vector parallel to `pred_nm` and carrying the same names.

### preparation

#### parent text of every sentence, read from the dfm without modifying it
#### (overwriting the internal `docid_` docvar could interfere with later dfm
#### operations, so the dfm is left untouched)
doc_key <- as.character(dfmat_feat_select@docvars$docid_)

#### row positions of the sentences of each text, texts in order of appearance
#### (no assumption that the texts are numbered 1..N without gaps)
rows_by_doc <- split(
  seq_along(doc_key),
  factor(doc_key, levels = unique(doc_key))
)
num_texts <- length(rows_by_doc)

#### results are written back by row position, so the unsmoothed prediction
#### must have exactly one entry per row of the dfm
stopifnot(length(pred_nm) == length(doc_key))

#### settings
kernel_m <- 2L        # Daniell kernel half-width: window of 2 * kernel_m + 1 sentences
min_sentences <- 10L  # texts with fewer sentences keep the unsmoothed prediction

### smoothing

#### initialize the output with the unsmoothed prediction; it is kept for
#### texts that are too short and for the first/last `kernel_m` sentences of
#### each smoothed text, for which the moving average returns no value
pred_nm_smoothed <- as.character(pred_nm)
names(pred_nm_smoothed) <- names(pred_nm)

#### for-loop performing smoothing (seq_len() also copes with zero texts)
for (i in seq_len(num_texts)) {

  # progress indicator
  if (i %% 1000 == 0) {
    print(i)
  }

  rows <- rows_by_doc[[i]]

  ##### smooth only texts with at least `min_sentences` sentences
  if (length(rows) < min_sentences) {
    next
  }

  ##### scores of all classes for every sentence of this text
  tmp <- dfmat_feat_select[rows, ]
  pred <- as.matrix(predict(tmod_nm, newdata = tmp, type = "all"))
  stopifnot(nrow(pred) == length(rows))
  pred[is.na(pred)] <- 0

  ##### exclude sentences classified as "greeting" from the smoothing process
  ##### Explanation: greetings or direct addresses are often used as a stylistic
  ##### device between substantial sentences; including them in the smoothing
  ##### process deteriorates the classification
  greet_col <- match("Greeting", colnames(pred), nomatch = 1L)
  is_greeting <- apply(pred, 1, which.max) == greet_col
  # zero the WHOLE row: `pred[idx]` without a comma would index the matrix
  # linearly and only clear the first-column cell of these sentences
  pred[is_greeting, ] <- 0

  ##### moving average over neighbouring sentences, separately per class;
  ##### the result has 2 * kernel_m fewer rows than `pred`
  smooth <- kernapply(pred, kernel("daniell", kernel_m))
  idx <- apply(smooth, 1, which.max)

  ##### write the smoothed class to the inner sentences of the text
  inner <- rows[(kernel_m + 1L):(length(rows) - kernel_m)]
  pred_nm_smoothed[inner] <- colnames(smooth)[idx]
}


# combine sentence-level metadata, sentence text and both predictions

## document variables as stored in the attributes of the sentence corpus
corp_sent_attr <- attributes(corp_sent)
meta <- corp_sent_attr$docvars

## plain character vector of the sentence texts
texts <- as.character(unclass(corp_sent))

## one row per sentence
df_classification <- cbind(
  meta,
  text = texts,
  pred = pred_nm,
  pred_smoothed = pred_nm_smoothed
)

# save df_classification as RDS-file
saveRDS(df_classification, file = "Classification_Data.RDS")
